Find tokens


In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from collections import Counter
from nltk.tokenize.punkt import PunktLanguageVars

In [2]:
phi_list = assemble_phi5_author_filepaths()
p = PunktLanguageVars()
j = JVReplacer()

In [3]:
all_que_tokens = []
all_ne_tokens = []
all_ue_tokens = []
for file in phi_list:
    with open(file) as f:
        r = f.read()
    text = phi5_plaintext_cleanup(r)
    chars = [chars for chars in text if chars not in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}', '*']]
    text = ''.join(chars)
    tokens = p.word_tokenize(text.lower())
    tokens = [j.replace(word) for word in tokens]
    que_tokens = [word for word in tokens if word[-3:] == 'que']
    ne_tokens = [word for word in tokens if word[-2:] == 'ne']
    ue_tokens = [word for word in tokens if word[-2:] == 'ue' and word[-3:] != 'que']
    all_que_tokens += que_tokens
    all_ne_tokens += ne_tokens
    all_ue_tokens += ue_tokens

Get OLD headwords

Get the headwords from the OLD which we will use to compare the suspect enclitics. If there is a match to a headword, then the word is not an enclitic.


In [4]:
from lxml import etree
from io import StringIO
import os
import re

In [5]:
old_path = os.path.expanduser('~/cltk_data/latin/lexicon/latin_lexica_perseus/latin_english_lexicon_old.xml')
with open(old_path) as f:
    old_xml = f.read()
root = etree.parse(StringIO(old_xml))

In [6]:
entries = root.xpath('/TEI.2/text/body/div0/entryFree')

In [7]:
print(len(entries))


51594

In [8]:
comp = re.compile('[0-9]|_|\^')

old_entries = []
for x in entries:
    headword = x.get('key')
    headword = comp.sub('', headword).lower()
    old_entries.append(headword)

In [9]:
print(len(old_entries))
old_entries = set(old_entries)
print(len(old_entries))


51594
49381

-que


In [11]:
print('Number words:', len(all_que_tokens))
print('Number unqiue words:', len(set(all_que_tokens)))
print(all_que_tokens[:30])


Number words: 182368
Number unqiue words: 27960
['usque', 'usque', 'utriusque', 'absque', 'neque', 'quoque', 'neque', 'neque', 'absque', 'quoque', 'uniuscuiusque', 'neque', 'neque', 'utrumque', 'absque', 'magisque', 'absque', 'absque', 'quinque', 'usque', 'quoque', 'neque', 'neque', 'quinque', 'quoque', 'utraque', 'doctique', 'rectorque', 'satorque', 'diuersasque']

In [12]:
counter = Counter(all_que_tokens)
mc = counter.most_common(30000)

In [13]:
len(mc)


Out[13]:
27960

In [14]:
known_que_exceptions = []
for pair in mc:
    suspect_enclitic = pair[0]
    if suspect_enclitic in old_entries:
        known_que_exceptions.append(suspect_enclitic)

In [15]:
len(known_que_exceptions)


Out[15]:
65

In [16]:
print(known_que_exceptions)


['atque', 'neque', 'quoque', 'itaque', 'usque', 'denique', 'quisque', 'namque', 'quinque', 'utique', 'plerumque', 'aeque', 'undique', 'cumque', 'plerique', 'utroque', 'uterque', 'ubique', 'utrimque', 'quocumque', 'quicumque', 'quaque', 'quacumque', 'que', 'quandoque', 'ubicumque', 'deque', 'utcumque', 'unusquisque', 'quotienscumque', 'inique', 'quousque', 'usquequaque', 'qualiscumque', 'utrasque', 'quantumcumque', 'oblique', 'absque', 'quandocumque', 'utrubique', 'quotiensque', 'antique', 'simulatque', 'quicque', 'undecumque', 'peraeque', 'utrobique', 'adusque', 'hucusque', 'adaeque', 'quomodocumque', 'quotcumque', 'quantuscumque', 'abusque', 'donique', 'inseque', 'circumundique', 'propinque', 'praecoque', 'quantuluscumque', 'longinque', 'conseque', 'utercumque', 'quotusquisque', 'quescumque']

-ne


In [17]:
print('Number words:', len(all_ne_tokens))
print('Number unqiue words:', len(set(all_ne_tokens)))
print(all_ne_tokens[:30])


Number words: 94747
Number unqiue words: 4179
['sine', 'contemplatione', 'sollicitudine', 'declinatione', 'effusione', 'paene', 'declinatione', 'occasione', 'ne', 'passione', 'semine', 'semine', 'sine', 'coctione', 'pane', 'bene', 'paene', 'declinatione', 'laxatione', 'pane', 'paene', 'remissione', 'defectione', 'sine', 'sine', 'declinatione', 'flegmone', 'inflammatione', 'passione', 'abstine']

In [18]:
counter = Counter(all_ne_tokens)
mc = counter.most_common(30000)

In [19]:
len(mc)


Out[19]:
4179

In [20]:
known_ne_exceptions = []
for pair in mc:
    suspect_enclitic = pair[0]
    if suspect_enclitic in old_entries:
        known_ne_exceptions.append(suspect_enclitic)

In [21]:
len(known_ne_exceptions)


Out[21]:
166

-ue


In [22]:
print('Number words:', len(all_ue_tokens))
print('Number unqiue words:', len(set(all_ue_tokens)))
print(all_ue_tokens[:30])


Number words: 14352
Number unqiue words: 2444
['siue', 'pingue', 'ioue', 'quaue', 'siue', 'plaustraue', 'siue', 'siue', 'naue', 'siue', 'ioue', 'breue', 'maiusue', 'minusue', 'summoue', 'quantoue', 'superisue', 'leue', 'uentosue', 'fidamue', 'praecipue', 'assidue', 'niue', 'niue', 'niue', 'siue', 'siue', 'quidue', 'graue', 'tenue']

In [23]:
counter = Counter(all_ue_tokens)
mc = counter.most_common(30000)

In [24]:
len(mc)


Out[24]:
2444

In [25]:
known_ue_exceptions = []
for pair in mc:
    suspect_enclitic = pair[0]
    if suspect_enclitic in old_entries:
        known_ue_exceptions.append(suspect_enclitic)

In [26]:
len(known_ne_exceptions)


Out[26]:
166

Assemble final exeptions list


In [27]:
all_exceptions = known_que_exceptions + known_ne_exceptions + known_ue_exceptions
print(len(all_exceptions))


251

In [28]:
print(all_exceptions)


['atque', 'neque', 'quoque', 'itaque', 'usque', 'denique', 'quisque', 'namque', 'quinque', 'utique', 'plerumque', 'aeque', 'undique', 'cumque', 'plerique', 'utroque', 'uterque', 'ubique', 'utrimque', 'quocumque', 'quicumque', 'quaque', 'quacumque', 'que', 'quandoque', 'ubicumque', 'deque', 'utcumque', 'unusquisque', 'quotienscumque', 'inique', 'quousque', 'usquequaque', 'qualiscumque', 'utrasque', 'quantumcumque', 'oblique', 'absque', 'quandocumque', 'utrubique', 'quotiensque', 'antique', 'simulatque', 'quicque', 'undecumque', 'peraeque', 'utrobique', 'adusque', 'hucusque', 'adaeque', 'quomodocumque', 'quotcumque', 'quantuscumque', 'abusque', 'donique', 'inseque', 'circumundique', 'propinque', 'praecoque', 'quantuluscumque', 'longinque', 'conseque', 'utercumque', 'quotusquisque', 'quescumque', 'ne', 'sine', 'bene', 'sane', 'paene', 'plane', 'nonne', 'mane', 'commune', 'pone', 'impune', 'insigne', 'benigne', 'necne', 'pane', 'magne', 'anne', 'superne', 'opportune', 'digne', 'lene', 'immane', 'indigne', 'bone', 'maligne', 'plene', 'tisiphone', 'dione', 'urbane', 'sicine', 'procne', 'pene', 'erigone', 'alcyone', 'cyrene', 'germane', 'humane', 'hicine', 'segne', 'insane', 'syene', 'clymene', 'perenne', 'chione', 'condigne', 'cyllene', 'peropportune', 'messene', 'christiane', 'antigone', 'progne', 'amymone', 'persephone', 'pallene', 'oenone', 'fraterne', 'melpomene', 'inhumane', 'euadne', 'taprobane', 'helxine', 'hermione', 'pyrene', 'phryne', 'dine', 'serene', 'hicne', 'hucine', 'daphne', 'sabine', 'asine', 'cyane', 'commagene', 'concinne', 'obscene', 'chamaedaphne', 'theophane', 'nerine', 'diutine', 'perbene', 'inurbane', 'thyone', 'ismene', 'elephantine', 'amoene', 'anemone', 'stephane', 'torone', 'priene', 'arne', 'inferne', 'hippocrene', 'sophene', 'roxane', 'rhene', 'feminine', 'pirene', 'carcine', 'mnemosyne', 'nyctimene', 'susiane', 'pleione', 'pitane', 'mitylene', 'elatine', 'alsine', 'mytilene', 'matutine', 'oxymyrsine', 'peremne', 'hesione', 'absone', 'sithone', 'limone', 'acharne', 'hierabotane', 'euphrone', 'moene', 'zone', 'arachne', 'pellene', 'calymne', 'bizone', 'elleborine', 'impoene', 'corone', 'halcyone', 'paraetacene', 'istucine', 'chalbane', 'semiplene', 'masculine', 'acrisione', 'mesene', 'belone', 'praefiscine', 'consone', 'barine', 'inconcinne', 'aeschynomene', 'anadyomene', 'orphne', 'andrachne', 'pylene', 'prone', 'adhucine', 'hispane', 'aparine', 'importune', 'asiane', 'catacecaumene', 'chamaemyrsine', 'hedone', 'supine', 'myrmidone', 'nuncine', 'perindigne', 'prasiane', 'rhododaphne', 'euphrosyne', 'perbenigne', 'itone', 'patalene', 'bulbine', 'iasione', 'selene', 'praecipue', 'assidue', 'strenue', 'ambigue', 'sue', 'perspicue', 'congrue', 'incongrue', 'ingenue', 'exigue', 'fatue', 'continue', 'superflue', 'prospicue', 'mutue', 'fue', 'innocue', 'perexigue', 'supplicue', 'contigue']

In [ ]: